/* Table 3 Panels B and C: wage and employment regressions with high/low skilled and versatility */
libname edu '!userprofile\\Dropbox\Education\Replication\Data';

* get data;
proc sql;
  create table wage_enrolment as
  select a.*, b.log_bachelor, exp(b.log_bachelor_male)/exp(b.log_bachelor_female) as mf_ratio, b.year_ending
  from edu.wagesoc as a, edu.enrolment as b
  where 1 <= a.year - b.year_ending <= 7 and a.major = b.majorcode;
quit;

proc sql;
  create table regression2 as
  select distinct annual_wage, annual_wage_high, annual_wage_low, delta_tot_emp, delta_tot_emp_high, delta_tot_emp_low, year, major
  from edu.wagesoc;
quit;

proc sql;
  create table regression2 as
  select distinct a.*, b.log_mktcape_major, b.log_btme_major, b.log_agee_major, b.major
  from regression2 as a, edu.skew as b
  where a.year = b.year + 1 and a.major = b.major;
quit;

proc sql;
  create table enrolment_skew_return2 as
  select a.*, b.skew1e_major
  from regression2 as a, edu.skew as b
  where 3 <= a.year - b.year <= 7 and a.major = b.major and a.major = b.major and a.major ~= .;
quit;

proc sql;
  create table wage_enrolment1 as
  select distinct a.year, a.major, b.log_bachelor, b.mf_ratio
  from regression2 as a, wage_enrolment as b
  where a.year = b.year and 1 <= a.year - b.year_ending <= 2 and a.major = b.major;
quit;

* calculate the average;
proc sort data = enrolment_skew_return2; by year major major; run;

proc means data = enrolment_skew_return2 noprint;
  by year major major;
  var skew1e_major;
  output out = return2 mean = skew1e_major_avg2;
run;

proc sort data = wage_enrolment1; by year major; run;

proc means data = wage_enrolment1 noprint;
  by year major;
  var log_bachelor mf_ratio;
  output out = enrolment1 mean = log_bachelor_avg1 mf_ratio_avg1;
run;

* require all lags to be present;
data return2; set return2; if _FREQ_ < 5 then delete; run;

* create a dataset for regressions;
proc sql;
  create table regression2 as
  select a.*, b.skew1e_major_avg2
  from regression2 as a, return2 as b
  where a.year = b.year and a.major = b.major and a.major = b.major and a.major ~= .;
quit;

proc sql;
  create table regression2 as
  select a.*, b.log_bachelor_avg1, b.mf_ratio_avg1
  from regression2 as a, enrolment1 as b
  where a.year = b.year and a.major = b.major;
quit;

* add time-series measures;
proc sql;
  create table temp as
  select distinct a.*, b.tsmean5_major as tsmean_major_avg2, b.tsvol5_major as tsvol_major_avg2
  from regression2 as a left join edu.skew as b
  on a.year = b.year+3 and a.major = b.major;
quit;

data regression_avg2; set temp; run;

/* versatility, HHI of employment in different naics */
/* major-naics comes from BLS actual employment data */
data occ (keep = year major naics tot_emp);
  set edu.major_naics;
run;

proc sort data = occ; 
  by year major;
run;

proc means data = occ noprint;
  var tot_emp;
  by year major;
  output out = occ2 sum = sum_emp;
run;

proc sql;
  create table occ3 as
  select a.*, b.sum_emp
  from occ as a, occ2 as b
  where a.major = b.major and a.year = b.year;
quit;

data occ4;
  set occ3;
  frac = tot_emp/sum_emp;
  frac_sq = frac * frac;
run;

proc sort data = occ4; by year major; run;

proc means data = occ4 noprint;
  by year major;
  var frac_sq;
  output out = occ5 sum = hhi;
run;

proc sort data = occ5; by year; run;

proc means data = occ5 noprint; by year; var hhi; output out = occ6 mean = med_hhi; run;

proc sql; create table occ5 as select a.*, b.med_hhi from occ5 as a, occ6 as b where a.year = b.year; quit;

data occ5;
  set occ5;
  versatility = 0;
  if hhi < med_hhi then versatility = 1;
run;

proc sql;
  create table regression_avg2 as
  select a.*, b.versatility
  from regression_avg2 as a left join occ5 as b
  on a.major = b.major and a.year = b.year;
quit;


/* merge with wage data (average industry entry-level wage and average major entry-level wage) */
data wage (keep = year major annual_wage);
  set edu.wagesoc;
  if year < 1999 then delete;
run;

data wage2 (keep = year major annual_wage);
  set edu.compustat_wage;
  if year >= 1999 then delete;
  annual_wage = annual_wagee_major;
run;

proc append base = wage2 data = wage; run;

proc sql;
  create table regression_avg2 as
  select a.*, b.annual_wage as annual_wage_avg2, b.year
  from regression_avg2 as a left join wage2 as b
  on a.major = b.major and 
  a.year - b.year = 3;
quit;

* year fixed effects, major fixed effects, cluster std error by year;
data regression_avg2; 
  set regression_avg2;
  if major = 12 or major = 21 then delete;
run;

proc standard data = regression_avg2 out = regression_avg3 mean = 0 std = 1; run;

data regression_avg2; set regression_avg2; num = _N_; run;

data regression_avg3 (drop = year major annual_wage annual_wage_high annual_wage_low delta_tot_emp delta_tot_emp_high delta_tot_emp_low versatility); set regression_avg3; num = _N_; run;

proc sql;
  create table regression_avg2 as
  select a.*, b.year, b.major, b.annual_wage, b.annual_wage_high, b.annual_wage_low, b.delta_tot_emp, b.delta_tot_emp_high, b.delta_tot_emp_low, b.versatility
  from regression_avg3 as a, regression_avg2 as b
  where a.num = b.num;
quit;

data regression_avg2; 
  set regression_avg2;
  skew_versatility = skew1e_major_avg2 * versatility;
run;


proc surveyreg data = regression_avg2;
  class
  year 
  major
  ;
 cluster year;

  model annual_wage =
  skew1e_major_avg2

  skew_versatility
  versatility

  tsmean_major_avg2 
  tsvol_major_avg2
  log_bachelor_avg1
  mf_ratio_avg1
  annual_wage_avg2

  log_mktcape_major 
  log_btme_major 
  log_agee_major
  year 
  major
  /solution adjrsq;
run;

proc surveyreg data = regression_avg2;
  class
  year 
  major
  ;
 cluster year;

  model delta_tot_emp =
  skew1e_major_avg2

  skew_versatility
  versatility

  tsmean_major_avg2 
  tsvol_major_avg2
  log_bachelor_avg1
  mf_ratio_avg1
  annual_wage_avg2

  log_mktcape_major 
  log_btme_major 
  log_agee_major
  year 
  major
  /solution adjrsq;
run;


/* Panel B with high- and low-skilled occupations */
data regression_avg3; set regression_avg2; high = 0; annual_wage2 = annual_wage_low; delta_tot_emp2 = delta_tot_emp_low; run;

data high; set regression_avg2; high = 1; annual_wage2 = annual_wage_high; delta_tot_emp2 = delta_tot_emp_high; run;

proc append base = regression_avg3 data = high; run;

data regression_avg3; 
  set regression_avg3; 
  low = 1 - high;
  skew_low = skew1e_major_avg2 * low; 
  major2 = major + 100*high; 
run;

proc sort data = regression_avg3; by year major; run;

proc surveyreg data = regression_avg3;
  class
  year
  major2
  ;
  cluster year;

  model annual_wage2 =
 
  skew1e_major_avg2
  skew_low
  tsmean_major_avg2
  tsvol_major_avg2
  log_bachelor_avg1
  mf_ratio_avg1 
  annual_wage_avg2
  log_mktcape_major 
  log_btme_major log_agee_major

  year
 major2
  /solution adjrsq;
run;

proc surveyreg data = regression_avg3;
  class
  year
  major2 
  ;
  cluster year;

  model delta_tot_emp2 =

 skew1e_major_avg2
  skew_low
  tsmean_major_avg2
  tsvol_major_avg2
  log_bachelor_avg1
  mf_ratio_avg1 
  annual_wage_avg2
  log_mktcape_major 
  log_btme_major log_agee_major

year 
major2
  /solution adjrsq;
run;


